import pandas as pd
from IPython.display import display
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
# Load the raw Olist e-commerce CSV tables into dataframes.
customer = pd.read_csv('olist_customers_dataset.csv')
geolocation = pd.read_csv('olist_geolocation_dataset.csv')
orders = pd.read_csv('olist_orders_dataset.csv')
order_items = pd.read_csv('olist_order_items_dataset.csv')
order_payments = pd.read_csv('olist_order_payments_dataset.csv')
order_reviews = pd.read_csv('olist_order_reviews_dataset.csv')
products = pd.read_csv('olist_products_dataset.csv')
sellers = pd.read_csv('olist_sellers_dataset.csv')
# English translations of the Portuguese product-category names.
english_desc = pd.read_csv('product_category_name_translation.csv')
# Collect all raw tables in one list so shared operations can be looped over.
df_list = [customer, geolocation, orders, order_items, order_payments,
           order_reviews, products, sellers]
# Preview every table. display() renders proper tables in Jupyter
# (print() would not).
for frame in df_list:
    display(frame.head(5))
# Build a per-table summary: shape, total null count, and which columns
# contain nulls.
df_names = ['customer', 'geolocation', 'orders', 'order_items',
            'order_payments', 'order_reviews', 'products', 'sellers']
df_info_table = pd.DataFrame({})  # empty frame, filled column by column
df_info_table['Dataframe'] = df_names  # acts as the row label
df_info_table['Rows'] = [df.shape[0] for df in df_list]
df_info_table['Columns'] = [df.shape[1] for df in df_list]
df_info_table['Null Value Count'] = [df.isnull().sum().sum() for df in df_list]
df_info_table['Null Columns'] = [len([col for col, null in df.isnull().sum().items() if null > 0]) for df in df_list]
df_info_table['Null Columns Name'] = [', '.join([col for col, null in df.isnull().sum().items() if null > 0]) for df in df_list]
# Choosing a colour palette with the seaborn library.
cm = sns.light_palette("maroon", as_cmap=True)
# Mapping the style onto the info table (rendered in notebook output).
df_info_table.style.background_gradient(cmap=cm)
customer.dtypes
# customer_unique_id vs customer_id: comparing unique counts to decide
# whether one of the two columns is redundant.
len(customer.customer_unique_id.unique()),len(customer.customer_id.unique())
# NOTE(review): drop() is not assigned back and inplace is not set, so this
# only previews the drop — `customer` still contains customer_unique_id.
customer.drop(['customer_unique_id'],axis=1).head(1) #Axis = 1 drops the column
orders.dtypes
orders.head(1)
orders.order_status.unique()
# The raw timestamps carry a time component this project does not need, so
# keep date-only versions and drop the raw columns.
orders.columns
# Midnight-normalised datetime64 versions of the purchase/delivery stamps.
# Equivalent to the original's .dt.date followed by a re-parse with
# apply(pd.to_datetime), without converting each column twice.
orders['order_purchase_date'] = pd.to_datetime(orders['order_purchase_timestamp']).dt.normalize()
orders['order_delivered_date'] = pd.to_datetime(orders['order_delivered_customer_date']).dt.normalize()
# Estimated delivery keeps plain datetime.date objects, as in the original.
orders['order_estimated_delivery_date'] = pd.to_datetime(orders['order_estimated_delivery_date']).dt.date
orders.head(1)
orders.columns
# Drop the raw timestamp columns now that the date versions exist (the
# intermediate *_time columns of the original are never created here, so
# they no longer need dropping).
orders = orders.drop(columns=['order_purchase_timestamp',
                              'order_approved_at',
                              'order_delivered_customer_date',
                              'order_delivered_carrier_date'])
orders.head()
order_items.dtypes
order_items.columns
# shipping_limit_date is not needed for this analysis.
order_items = order_items.drop(columns=['shipping_limit_date'])
order_items.head(1)
order_reviews.dtypes
# Only the review score is needed; drop the free-text and timestamp columns.
# (The original passed both columns= and a redundant axis=1.)
order_reviews = order_reviews.drop(columns=['review_comment_title', 'review_comment_message',
                                            'review_creation_date', 'review_answer_timestamp'])
order_reviews.head(1)
products.dtypes
products.columns
products.head(1)
sellers.dtypes
#####################################
# Exporting the cleaned tables.
# NOTE(review): hard-coded absolute Windows path — adjust per machine.
out_dir = r'C:\Users\Suyash\BALC Project\Cleaned'
customer.to_csv(out_dir + r'\customer.csv')
geolocation.to_csv(out_dir + r'\geolocation.csv')
orders.to_csv(out_dir + r'\orders.csv')
order_items.to_csv(out_dir + r'\order_items.csv')
order_payments.to_csv(out_dir + r'\order_payments.csv')
order_reviews.to_csv(out_dir + r'\order_reviews.csv')
products.to_csv(out_dir + r'\products.csv')
sellers.to_csv(out_dir + r'\sellers.csv')
########################################
# Build the master table by chaining merges on the shared keys.
final = pd.merge(customer, orders, on='customer_id')
final = pd.merge(final, order_items, on='order_id')
final = pd.merge(final, order_payments, on='order_id')
final = pd.merge(final, order_reviews, on='order_id')
final = pd.merge(final, products, on='product_id')
final = pd.merge(final, sellers, on='seller_id')
final = pd.merge(final, english_desc, on='product_category_name')
# BUG FIX: the original called final.dropna() without assigning the result,
# so the NaN rows were never actually removed before export.
final = final.dropna()
final.to_csv(out_dir + r'\final.csv')
final.columns
##########################################
###################################################
# One-row summary of the merged table.
# BUG FIX: the original assigned scalars to an empty DataFrame, which
# creates 0-row columns, and the later list-valued assignment then raises a
# length-mismatch ValueError. Build the frame from one-element lists instead.
null_cols = [col for col, null in final.isnull().sum().items() if null > 0]
df_info_table = pd.DataFrame({
    'Dataframe': ['Final'],
    'Rows': [final.shape[0]],
    'Columns': [final.shape[1]],
    'Null Value Count': [final.isnull().sum().sum()],
    'Null Columns': [len(null_cols)],
    'Null Columns Name': [', '.join(null_cols)],
})
# Choosing a colour palette with the seaborn library.
cm = sns.light_palette("maroon", as_cmap=True)
# Mapping the style onto the summary table.
df_info_table.style.background_gradient(cmap=cm)
final.head()
final.dtypes
orders.dtypes
# Daily order counts.
time_series_df = pd.DataFrame(columns=['Date', 'Orders'])
time_series_df['Date'] = orders['order_purchase_date'].dt.date
time_series_df['Orders'] = 1
time_series_df = time_series_df.groupby(['Date']).sum()
time_series_df = time_series_df.reset_index()
# BUG FIX: the original assigned orders[...].dt.year here, aligning the much
# longer `orders` index against the aggregated frame and producing wrong
# years. Derive the year from the aggregated Date column itself.
time_series_df['Year'] = pd.to_datetime(time_series_df['Date']).dt.year
fig = px.histogram(time_series_df, x="Year", y="Orders", histfunc="sum",
                   title="Orders By Year", color=time_series_df['Year'])
fig.update_traces(xbins_size=0)
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="Y")
fig.update_layout(bargap=0.5)
fig.show()
# Monthly order histogram.
fig = px.histogram(time_series_df, x="Date", y="Orders", histfunc="sum",
                   title="Orders By Month", color=time_series_df['Orders'])
fig.update_traces(xbins_size="M1")
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.update_layout(bargap=0.3)
fig.show()
order_items.columns
products.columns
orders.columns
# Per-category order counts per day: attach category name and purchase date
# to each order item, then aggregate.
time_series_category = order_items.merge(products,on=['product_id'],how='left')
time_series_category = time_series_category.merge(orders,on=['order_id'],how='left')
time_series_category['order'] = 1
time_series_category.columns
time_series_category = time_series_category[['product_category_name', 'order_purchase_date','order']]
time_series_category = time_series_category.groupby(['order_purchase_date','product_category_name']).sum().reset_index()
time_series_category.head(10)
# Monthly order histogram coloured by product category.
fig = px.histogram(time_series_category, x='order_purchase_date', y="order", histfunc="sum", title="Orders By Month",color=time_series_category['product_category_name'])
fig.update_traces(xbins_size="M1")
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.update_layout(bargap=0.3)
fig.show()
# Daily order counts as a bar chart.
fig = px.bar(time_series_df, x = 'Date',y="Orders",color='Orders',title='Orders By Date')
fig.show()
# Delivery-time analysis: days between purchase and delivery.
delivery_analysis = pd.DataFrame(columns =['Purchase','Delivered'])
delivery_analysis['Purchase'] = orders['order_purchase_date']
delivery_analysis['Delivered'] = orders['order_delivered_date']
# Both columns are datetime64, so the difference is a timedelta; keep days.
delivery_analysis['Time Taken'] = (delivery_analysis['Delivered'] - delivery_analysis['Purchase']).dt.days
# Undelivered orders have NaT delivery dates and are removed here.
delivery_analysis = delivery_analysis.dropna()
delivery_analysis.head(5)
delivery_analysis['Time Taken'].describe()
'''
fig = go.Figure(data=[go.Box(y=delivery_analysis['Time Taken'],
boxpoints='all', # can also be outliers, or suspectedoutliers, or False
jitter=0.3, # add some jitter for a better separation between points
pointpos=-1.8 # relative position of points wrt box
)])
fig.show()
'''
#Interactive BoxPlot - Not an Optimal Code do not run
# Static seaborn alternative: boxplot with a translucent strip overlay.
ax = sns.boxplot(y="Time Taken", data=delivery_analysis, whis=np.inf,color='cyan')
ax = sns.stripplot(y="Time Taken", data=delivery_analysis, color="maroon",alpha=.05)
# Per-city aggregation of delivery time, price, and freight.
city_analysis = pd.DataFrame(columns=['City', 'State', 'Time Taken', 'Total Price'])
city_analysis['City'] = final['customer_city']
city_analysis['State'] = final['customer_state']
city_analysis['Time Taken'] = (final['order_delivered_date'] - final['order_purchase_date']).dt.days
city_analysis['Cost'] = final['price']
city_analysis['Freight'] = final['freight_value']
city_analysis['Order'] = 1  # one row per order line, summed below
city_analysis.head(1)
# NOTE(review): 'State' is a string column (sum concatenates it) and
# 'Total Price' is declared but never filled — kept as in the original.
city_analysis = city_analysis.groupby('City').sum()
city_analysis['Freight Avg'] = city_analysis['Freight'] / city_analysis['Order']
city_analysis['Time Taken'] = city_analysis['Time Taken'] / city_analysis['Order']
city_analysis = city_analysis.dropna()
city_analysis.head(1)
city_analysis['Order'].describe()
# BUG FIX: the first message said "less than 3" but the condition tests <= 10.
print('Cities With 10 or Fewer Orders = ', (city_analysis['Order'] <= 10).sum())
print('Cities With Greater than 10 Orders = ', (city_analysis['Order'] > 10).sum())
city_analysis['High/Lo'] = city_analysis['Order'].apply(lambda x: 'High' if x > 10 else 'Low')
city_analysis.head(1)
# Drawn twice (separate notebook cells): linear scale, then log y-axis.
ax = sns.boxplot(x='High/Lo', y="Freight Avg", data=city_analysis, whis=np.inf)
ax = sns.boxplot(x='High/Lo', y="Freight Avg", data=city_analysis, whis=np.inf)
ax.set_yscale('log')
city_analysis['Time Taken'].describe()
# Bucket average wait into Low (<= 8 days) / Normal / High (>= 21 days).
city_analysis['Wait'] = city_analysis['Time Taken'].apply(lambda x: 'High' if x >= 21 else ('Low' if x <= 8 else 'Normal'))
x = sns.boxplot(x='Wait', y="Order", data=city_analysis, whis=np.inf)
x.set_yscale('log')
# Wait time per customer state.
city_wait_time = customer.merge(orders, on='customer_id', how='left')
city_wait_time.columns
city_wait_time = city_wait_time[['customer_city', 'customer_state', 'order_id', 'order_purchase_date',
                                 'order_delivered_date']]
city_wait_time['time_taken'] = (city_wait_time['order_delivered_date'] - city_wait_time['order_purchase_date']).dt.days
city_wait_time.columns
city_wait_time = city_wait_time[['customer_city', 'customer_state', 'time_taken']]
city_wait_time['total_order'] = 1
city_wait_time.head(1)
fig, ax = plt.subplots()
x = sns.boxplot(x='customer_state', y="time_taken", data=city_wait_time, whis=np.inf)
fig.set_size_inches(11.7, 8.27)
x.set_title("Time Variation By State", fontsize=20)
# BUG FIX: label typo "Sate" -> "State".
x.set_xlabel("State", fontsize=14)
x.set_ylabel("Time Taken In Days", fontsize=14)
# Customer count per state.
fig, ax = plt.subplots()
g = sns.countplot(x='customer_state', data=final, orient='h')
g.set_title("Customer's State Distribution", fontsize=20)
g.set_xlabel("State", fontsize=14)
g.set_ylabel("Count", fontsize=14)
fig.set_size_inches(11.7, 8.27)
# Seller count per state.
fig, ax = plt.subplots()
g = sns.countplot(x='seller_state', data=final, orient='h')
# Typo fix: "Sellers's" -> "Sellers'".
g.set_title("Sellers' State Distribution", fontsize=20)
g.set_xlabel("State", fontsize=14)
g.set_ylabel("Count", fontsize=14)
fig.set_size_inches(11.7, 8.27)
# Share of customers vs sellers per state, side-by-side bars.
k = pd.DataFrame({'customers': customer['customer_state'].value_counts(),
                  'sellers': sellers['seller_state'].value_counts()})
k = k.sort_values(by='customers', ascending=False)
k = k.fillna(0)  # states with no sellers (or customers) get 0
# Vectorised normalisation (the original re-summed the whole column inside
# an element-wise apply on every row).
k['sellers'] = k['sellers'] / k['sellers'].sum()
k['customers'] = k['customers'] / k['customers'].sum()
labels = k.T.columns
sel = k['sellers']
cus = k['customers']
x = np.arange(len(labels))  # the label locations
width = 0.2  # the width of the bars
fig, ax = plt.subplots(figsize=(20, 10))
rects1 = ax.bar(x - width/2, sel, width, label='Sellers')
rects2 = ax.bar(x + width/2, cus, width, label='Customers')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Percentage of total per state', fontsize=14)
ax.set_xlabel('States', fontsize=14)
ax.set_title('Customers and sellers location by state', fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
# Freight value distribution per customer state.
fig, ax = plt.subplots()
g3 = sns.boxplot(x='customer_state', y='freight_value',
                 data=final[final['price'] != -1])
g3.set_title("CUSTOMER's State by Freight Value", fontsize=20)
g3.set_xlabel("State", fontsize=14)
g3.set_ylabel("Freight Value", fontsize=14)
fig.set_size_inches(11.7, 8.27)
# The nicest state ? AP: macapa laranjal do jari
plt.figure(figsize=(14, 8))
# Order states by mean review score. Selecting the column before mean()
# avoids aggregating the non-numeric columns of `final`.
state_order = (final.groupby('customer_state')['review_score']
               .mean().sort_values(ascending=False).index.values)
sns.barplot(data=final, x='customer_state', y='review_score',
            order=state_order, errcolor='grey')
plt.title('Most Satisfied State'.title(), fontsize=20)
plt.ylabel('Average Review Score'.title(), fontsize=14)
plt.xlabel('customer state'.title(), fontsize=14)
# --- Second section: cohort / RFM analysis (re-imports and notebook setup). ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
sns.set(style="ticks")
import gc
import itertools
from datetime import datetime
import warnings
# Silence deprecation noise from the plotting libraries.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore")
pd.set_option('display.max_columns', 100)
# Fix the RNG seed for reproducibility.
np.random.seed(42)
import plotly
from datetime import datetime, timedelta
import plotly.offline as pyoff
import plotly.graph_objs as go
#initiate visualization library for jupyter notebook
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
pyoff.init_notebook_mode(connected=True)
# %matplotlib inline  (Jupyter line magic — only valid inside a notebook, not a plain .py script)
# defining visualization helper functions
def format_spines(ax, right_border=True):
    """Recolour the axes spines and background; optionally hide the right spine.

    BUG FIX: in the original both branches set the right spine to '#FFFFFF',
    so the right_border flag had no effect; False now hides the spine.
    """
    ax.spines['bottom'].set_color('#666666')
    ax.spines['left'].set_color('#666666')
    ax.spines['top'].set_visible(False)
    if right_border:
        ax.spines['right'].set_color('#FFFFFF')
    else:
        ax.spines['right'].set_visible(False)
    ax.patch.set_facecolor('#FFFFFF')
def count_plot(feature, df, colors='Blues_d', hue=False, ax=None, title=''):
    """Draw a seaborn countplot annotated with per-bar percentage labels.

    feature: column to count; df: source frame; hue: optional second
    categorical column; title: overrides the auto-generated title.
    """
    ncount = len(df)  # denominator for the percentage labels
    if hue != False:
        ax = sns.countplot(x=feature, data=df, palette=colors, hue=hue, ax=ax)
    else:
        ax = sns.countplot(x=feature, data=df, palette=colors, ax=ax)
    format_spines(ax)
    # Annotate each bar with its share of the total row count.
    for p in ax.patches:
        x = p.get_bbox().get_points()[:, 0]
        y = p.get_bbox().get_points()[1, 1]
        ax.annotate('{:.1f}%'.format(100. * y / ncount), (x.mean(), y),
                    ha='center', va='bottom')  # set the alignment of the text
    # Auto title, optionally overridden by the caller.
    if not hue:
        ax.set_title(df[feature].describe().name + ' Analysis', size=13, pad=15)
    else:
        ax.set_title(df[feature].describe().name + ' Analysis by ' + hue, size=13, pad=15)
    if title != '':
        ax.set_title(title)
    plt.tight_layout()
def bar_plot(x, y, df, colors='Blues_d', hue=False, ax=None, value=False, title=''):
    """Draw a seaborn barplot annotated with absolute values or percentages.

    value=True labels bars as '<v/1000>k'; otherwise as percent of the total.
    """
    # Denominator for percentage labels; fall back to x if y is non-numeric.
    # (Original used a bare `except:` here.)
    try:
        ncount = sum(df[y])
    except (TypeError, KeyError):
        ncount = sum(df[x])
    if hue != False:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, hue=hue, ax=ax, ci=None)
    else:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, ax=ax, ci=None)
    # Setting borders
    format_spines(ax)
    # Annotate each bar.
    for p in ax.patches:
        xp = p.get_bbox().get_points()[:, 0]
        yp = p.get_bbox().get_points()[1, 1]
        if value:
            ax.annotate('{:.2f}k'.format(yp / 1000), (xp.mean(), yp),
                        ha='center', va='bottom')  # set the alignment of the text
        else:
            ax.annotate('{:.1f}%'.format(100. * yp / ncount), (xp.mean(), yp),
                        ha='center', va='bottom')  # set the alignment of the text
    # Auto title, optionally overridden by the caller.
    if not hue:
        ax.set_title(df[x].describe().name + ' Analysis', size=12, pad=15)
    else:
        ax.set_title(df[x].describe().name + ' Analysis by ' + hue, size=12, pad=15)
    if title != '':
        ax.set_title(title)
    plt.tight_layout()
# loading data
customers_ = pd.read_csv("olist_customers_dataset.csv")
order_items_ = pd.read_csv("olist_order_items_dataset.csv")
order_payments_ = pd.read_csv("olist_order_payments_dataset.csv")
orders_ = pd.read_csv("olist_orders_dataset.csv")
# Map of display name -> dataframe, used by the inspection loops below.
#dataset = [customers, geolocation, order_items, order_payments, order_reviews, orders, products, sellers, category_name_translation]
dataset = {
    'Customers': customers_,
    'Order Items': order_items_,
    'Payments': order_payments_,
    'Orders': orders_
}
# displaying data shape
for name, frame in dataset.items():
    print(f'{name}', (list(frame.shape)))
# displaying dataset column names
for name, frame in dataset.items():
    print(f'{name}', f'{list(frame.columns)}\n')
# checking for null values in datasets
for name, frame in dataset.items():
    print(f'{name}: {frame.isnull().any().any()}')
# taking count for dataset with missing values
for name, frame in dataset.items():
    if frame.isnull().any().any():
        print(f'{name}', (list(frame.shape)), '\n')
        print(f'{frame.isnull().sum()}\n')
# creating master dataframe
order_payments_.head()
df1 = order_payments_.merge(order_items_, on='order_id')
df2 = df1.merge(orders_, on='order_id')
df = df2.merge(customers_, on='customer_id')
print(df.shape)
# converting date columns to datetime
date_columns = ['shipping_limit_date', 'order_purchase_timestamp', 'order_approved_at',
                'order_delivered_carrier_date', 'order_delivered_customer_date',
                'order_estimated_delivery_date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S')
# Normalise display strings.
df['customer_city'] = df['customer_city'].str.title()
df['payment_type'] = df['payment_type'].str.replace('_', ' ').str.title()
# engineering new/essential columns
# positive = delivered before the estimated date
df['delivery_against_estimated'] = (df['order_estimated_delivery_date'] - df['order_delivered_customer_date']).dt.days
# Vectorised .dt accessors (the original used element-wise apply per row).
purchase_ts = df['order_purchase_timestamp']
df['order_purchase_year'] = purchase_ts.dt.year
df['order_purchase_month'] = purchase_ts.dt.month
df['order_purchase_dayofweek'] = purchase_ts.dt.dayofweek
df['order_purchase_hour'] = purchase_ts.dt.hour
df['order_purchase_day'] = df['order_purchase_dayofweek'].map({0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'})
df['order_purchase_mon'] = df['order_purchase_month'].map({1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'})
# 'YYYY-MM' label with zero-padded month, so string sorting is chronological.
df['month_year'] = purchase_ts.dt.strftime('%Y-%m')
# Numeric year-month key, e.g. 201705.
df['month_y'] = purchase_ts.dt.year * 100 + purchase_ts.dt.month
# displaying summary statistics of columns
df.describe(include='all')
# displaying missing value counts and corresponding percentage against total observations
missing_values = df.isnull().sum().sort_values(ascending=False)
percentage = (df.isnull().sum() / df.isnull().count() * 100).sort_values(ascending=False)
pd.concat([missing_values, percentage], axis=1, keys=['Values', 'Percentage']).transpose()
# dropping missing values
df.dropna(inplace=True)
df.isnull().values.any()
# displaying dataframe info
df.info()
# excluding incomplete 2016 months and displaying first 3 rows
df = df.query("month_year != '2016-12' and month_year != '2016-10'")
df.head(3)
# Monthly revenue: month_year vs summed payment value.
df_revenue = df.groupby(['month_year'])['payment_value'].sum().reset_index()
df_revenue
# Monthly revenue growth rate.
# pct_change() gives the month-over-month percentage change.
df_revenue['MonthlyGrowth'] = df_revenue['payment_value'].pct_change()
df_revenue
# Monthly active customers: unique customer ids per month.
df_monthly_active = df.groupby('month_year')['customer_unique_id'].nunique().reset_index()
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='customer_unique_id', df=df_monthly_active, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Monthly order counts (rows per month).
df_monthly_sales = df.groupby('month_year')['order_status'].count().reset_index()
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='order_status', df=df_monthly_sales, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Average revenue per order, by month.
df_monthly_order_avg = df.groupby('month_year')['payment_value'].mean().reset_index()
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='payment_value', df=df_monthly_order_avg, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# First purchase date per customer.
df_min_purchase = df.groupby('customer_unique_id').order_purchase_timestamp.min().reset_index()
df_min_purchase.columns = ['customer_unique_id', 'minpurchasedate']
# Numeric year-month key (e.g. 201705); vectorised instead of the original's
# element-wise map(lambda ...).
df_min_purchase['minpurchasedate'] = (df_min_purchase['minpurchasedate'].dt.year * 100
                                      + df_min_purchase['minpurchasedate'].dt.month)
# Merge the first-purchase month onto the main dataframe.
df = pd.merge(df, df_min_purchase, on='customer_unique_id')
# Customers buying after their first purchase month count as Existing.
df['usertype'] = 'New'
df.loc[df['month_y'] > df['minpurchasedate'], 'usertype'] = 'Existing'
# Revenue per month for each user type.
df_user_type_revenue = df.groupby(['month_y', 'usertype', 'month_year'])['payment_value'].sum().reset_index()
df_user_type_revenue
# Monthly revenue lines for new vs existing customers.
fig, ax = plt.subplots(figsize=(15, 6))
sns.set(palette='muted', color_codes=True)
ax = sns.lineplot(x='month_year', y='payment_value', data=df_user_type_revenue.query("usertype == 'New'"), label='New')
ax = sns.lineplot(x='month_year', y='payment_value', data=df_user_type_revenue.query("usertype == 'Existing'"), label='Existing')
format_spines(ax, right_border=False)
ax.set_title('Existing vs New Customer Comparison')
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# New-customer ratio per month: unique new buyers / unique existing buyers.
df_user_ratio = df.query("usertype == 'New'").groupby(['month_year'])['customer_unique_id'].nunique()/df.query("usertype == 'Existing'").groupby(['month_year'])['customer_unique_id'].nunique()
df_user_ratio = df_user_ratio.reset_index()
# The first and last months produce NaN ratios; drop them.
df_user_ratio = df_user_ratio.dropna()
df_user_ratio.columns = ['month_year','NewCusRatio']
df_user_ratio
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='NewCusRatio', df=df_user_ratio, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Monthly Retention Rate = retained customers from prev. month / total active.
# Active users by revenue per month (displayed, then recomputed below).
df_user_purchase = df.groupby(['customer_unique_id', 'month_y'])['payment_value'].sum().reset_index()
df_user_purchase.head()
# NOTE(review): immediately recomputed with count(), overwriting the sum()
# version above — kept as in the original.
df_user_purchase = df.groupby(['customer_unique_id', 'month_y'])['payment_value'].count().reset_index()
df_user_purchase.head()
# Retention matrix: one row per customer, one column per month.
df_retention = pd.crosstab(df_user_purchase['customer_unique_id'], df_user_purchase['month_y']).reset_index()
df_retention.head()
# Retained & total user counts for each month.
# NOTE(review): columns[2:] skips the first month column as well as the id
# column — confirm whether columns[1:] was intended.
months = df_retention.columns[2:]
retention_array = []
for i in range(len(months) - 1):
    retention_data = {}
    selected_month = months[i + 1]
    prev_month = months[i]
    retention_data['month_y'] = int(selected_month)
    retention_data['TotalUserCount'] = df_retention[selected_month].sum()
    # Users active in both the previous and the selected month.
    retention_data['RetainedUserCount'] = df_retention[(df_retention[selected_month] > 0) & (df_retention[prev_month] > 0)][selected_month].sum()
    retention_array.append(retention_data)
# Convert the array to a dataframe and calculate the retention rate.
df_retention = pd.DataFrame(retention_array)
df_retention['RetentionRate'] = df_retention['RetainedUserCount'] / df_retention['TotalUserCount']
df_retention
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_y', y='RetentionRate', df=df_retention, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Cohorts are determined by the first purchase year-month of the customers. I will measure
# what percentage of customers were retained after their first purchase in each month. This
# view unveils how recent and old cohorts differ in retention rate, and whether recent
# changes in customer experience affected new customers' retention.
# Rebuild the retention table with crosstab(); the column names are prefixed
# with 'm_' so they are legal identifiers inside .query().
df_retention = pd.crosstab(df_user_purchase['customer_unique_id'], df_user_purchase['month_y']).reset_index()
new_column_names = ['m_' + str(column) for column in df_retention.columns]
df_retention.columns = new_column_names
# Cohort-based retention: for each cohort month, the share of its users who
# remained active in every following month.
# NOTE: the original ran this identical loop twice back-to-back; the second
# pass reset retention_array and recomputed the same result, so one pass is kept.
retention_array = []
for i in range(len(months)):
    retention_data = {}
    selected_month = months[i]
    prev_months = months[:i]
    next_months = months[i + 1:]
    # Months before the cohort start are undefined for this cohort.
    for prev_month in prev_months:
        retention_data[prev_month] = np.nan
    total_user_count = retention_data['TotalUserCount'] = df_retention['m_' + str(selected_month)].sum()
    retention_data[selected_month] = 1  # 100% retained in the cohort month
    # Grow the query so each later month requires activity in ALL months since.
    query = "{} > 0".format('m_' + str(selected_month))
    for next_month in next_months:
        query = query + " and {} > 0".format(str('m_' + str(next_month)))
        retention_data[next_month] = np.round(df_retention.query(query)['m_' + str(next_month)].sum() / total_user_count, 2)
    retention_array.append(retention_data)
df_retention = pd.DataFrame(retention_array)
df_retention.index = months
# Show the cohort-based retention table.
df_retention
# Customers who shop on Olist have different needs and their own profiles. We should adapt
# our actions accordingly.
# RFM stands for Recency - Frequency - Monetary Value. I will use these metrics to segment
# customers. Theoretically the segments are: Low Value: customers who are less active than
# others, not very frequent buyers/visitors, generating very low - zero - maybe negative
# revenue. Mid Value: in the middle of everything; often using our platform (but not as much
# as our High Values), fairly frequent, generating moderate revenue. High Value: the group we
# don't want to lose — high revenue and frequency, low inactivity.
# To calculate recency, we find the most recent purchase date of each customer and see for
# how many days they have been inactive. Then we apply K-means clustering to assign each
# customer a recency score.
# Generic per-customer dataframe to hold the segmentation scores.
df_user = pd.DataFrame(df['customer_unique_id'])
df_user.columns = ['customer_unique_id']
# Max (most recent) purchase date for each customer.
df_max_purchase = df.groupby('customer_unique_id').order_purchase_timestamp.max().reset_index()
df_max_purchase.columns = ['customer_unique_id', 'MaxPurchaseDate']
# Observation point = the max purchase date in the whole dataset; recency is
# days of inactivity since then.
df_max_purchase['Recency'] = (df_max_purchase['MaxPurchaseDate'].max() - df_max_purchase['MaxPurchaseDate']).dt.days
# Attach recency to the per-customer frame.
df_user = pd.merge(df_user, df_max_purchase[['customer_unique_id','Recency']], on='customer_unique_id')
df_user.head()
# Summary statistics of the recency column.
df_user.Recency.describe()
# Distribution of recency.
sns.set(palette='muted', color_codes=True, style='white')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['Recency'], bins=30)
plt.show()
from sklearn.cluster import KMeans
# Elbow method: inertia for k = 1..9 recency clusters.
sse = {}
# .copy() so the clusters column below does not write into a slice of df_user.
df_recency = df_user[['Recency']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_recency)
    df_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure(figsize=(10, 5))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
# Building 5 clusters for recency and adding them to the dataframe.
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_user[['Recency']])
df_user['RecencyCluster'] = kmeans.predict(df_user[['Recency']])
# function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel cluster ids so they are ordered by each cluster's mean value.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the (arbitrary) k-means cluster ids.
    target_field_name : str
        Numeric column whose per-cluster mean defines the ordering.
    df : pandas.DataFrame
        Frame containing both columns.
    ascending : bool
        If True, cluster id 0 gets the lowest mean; if False, the highest.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``cluster_field_name`` replaced by ordered ids.
    """
    # Mean of the target per cluster, sorted into the desired order; the
    # sorted positional index becomes the new cluster id.
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    # Map every row onto its cluster's new id, then swap the columns.
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
# ascending=False: the cluster with the highest mean recency (longest
# inactive) gets id 0, so a higher cluster id means a more recent customer.
df_user = order_cluster('RecencyCluster', 'Recency',df_user,False)
# Displaying the details of each cluster.
df_user.groupby('RecencyCluster')['Recency'].describe()
# To create frequency clusters, I need the total number of orders for each customer, after
# which I can go ahead and place them in various clusters.
# Frequency: total number of order rows per customer.
df_frequency = df.groupby('customer_unique_id').order_purchase_timestamp.count().reset_index()
df_frequency.columns = ['customer_unique_id','Frequency']
# Attach frequency to the per-customer frame.
df_user = pd.merge(df_user, df_frequency, on='customer_unique_id')
# Summary statistics of the frequency column.
df_user.Frequency.describe()
# Distribution of frequency.
sns.set(palette='muted', color_codes=True, style='whitegrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['Frequency'], hist=False)
plt.show()
# K-means with 5 clusters on frequency.
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_user[['Frequency']])
df_user['FrequencyCluster'] = kmeans.predict(df_user[['Frequency']])
# ascending=True: higher cluster id = higher purchase frequency.
df_user = order_cluster('FrequencyCluster', 'Frequency',df_user,True)
# Per-cluster frequency summary.
df_user.groupby('FrequencyCluster')['Frequency'].describe()
# Let's see how our customer base looks when clustered by revenue. I will calculate revenue
# per customer, plot a histogram, and apply the same clustering method.
# Calculate revenue for each customer.
df_revenue = df.groupby('customer_unique_id').payment_value.sum().reset_index()
# Merge it with our main per-customer dataframe.
df_user = pd.merge(df_user, df_revenue, on='customer_unique_id')
# Distribution of customer revenue.
sns.set(palette='muted', color_codes=True, style='white')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['payment_value'], hist=False)
plt.show()
# Elbow method for the revenue clusters.
sse = {}
# .copy() so the clusters column does not write into a slice of df_user.
df_revenue = df_user[['payment_value']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_revenue)
    df_revenue["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure(figsize=(10, 5))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
# Apply clustering with 6 revenue clusters.
kmeans = KMeans(n_clusters=6)
kmeans.fit(df_user[['payment_value']])
df_user['RevenueCluster'] = kmeans.predict(df_user[['payment_value']])
# ascending=True: higher cluster id = higher revenue.
df_user = order_cluster('RevenueCluster', 'payment_value', df_user, True)
# Per-cluster revenue summary.
df_user.groupby('RevenueCluster')['payment_value'].describe()
# After creating the metric scores (cluster numbers) for recency, frequency and revenue,
# I now combine them into an overall score:
# Renaming columns to the standard RFM names.
df_user.columns = ['customer_unique_id', 'Recency', 'RecencyCluster', 'Frequency', 'FrequencyCluster', 'Monetary', 'RevenueCluster']
# Overall score = sum of the three ordered cluster ids; higher is better.
df_user['OverallScore'] = df_user['RecencyCluster'] + df_user['FrequencyCluster'] + df_user['RevenueCluster']
# BUG FIX: list selection — tuple-style multi-column indexing after groupby
# is deprecated and removed in pandas >= 2.0.
df_user.groupby('OverallScore')[['Recency', 'Frequency', 'Monetary']].mean()
# Segment thresholds on the overall score.
df_user['Segment'] = 'Low-Value'
df_user.loc[df_user['OverallScore'] > 3, 'Segment'] = 'Mid-Value'
df_user.loc[df_user['OverallScore'] > 6, 'Segment'] = 'High-Value'
df_user.head()
# Scatterplots of the segments across the three RFM axes.
sns.set(palette='muted', color_codes=True, style='whitegrid')
fig, axs = plt.subplots(1, 3, figsize=(22, 5))
sns.despine(left=True)
sns.scatterplot(x='Recency', y='Frequency', ax=axs[0], hue='Segment', data=df_user, size='Segment', sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
sns.scatterplot(x='Frequency', y='Monetary', ax=axs[1], hue='Segment', data=df_user, size='Segment', sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
sns.scatterplot(x='Recency', y='Monetary', ax=axs[2], hue='Segment', data=df_user, size='Segment', sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
axs[0].set_title('Customer Segments by Recency & Frequency')
axs[1].set_title('Customer Segments by Frequency & Monetary')
axs[2].set_title('Customer Segments by Recency & Monetary')
plt.show()
# pip install rpy2   (shell command — run from a terminal or a notebook "!" cell, not Python)
# %%R                (Jupyter cell magic — only valid inside a notebook, not a plain .py script)